Analyzes potential variables for use in points model


In [1]:
# Imports Packages
import pandas as pd
import numpy as np
from Player.Players import Players
from Regression.Reg_Model import Reg_Model
from Schedule.Stats import Stats
from Schedule.Schedule import Schedule
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Gets List of Players
Player_List = Players('2015-16').players

In [3]:
# Filters player list down to those average 15 minutes a game or more
Player_List_15 = {}
for p in Player_List:
    if np.mean(Player_List[p].game_logs['MIN'])>=15:
        Player_List_15[p] = Player_List[p]
print len(Player_List), len(Player_List_15)


450 313

In [7]:
# Creates log of all games and creates last-n statistics for players
all_stats=pd.DataFrame()
all_game_logs=pd.DataFrame()

for key in Player_List:
    p=Player_List[key]
    game_logs = p.game_logs
    game_logs['PTS_MIN'] = [x*1.0/y for x,y in zip(game_logs['PTS'], game_logs['MIN'])]
    
    all_game_logs = all_game_logs.append(game_logs)
    
    stats = Player_List[key].get_stats(game_logs = game_logs, date_col='GAME_DATE', 
                                       id_col='Game_ID', target_col='PTS', 
                                       col_list=['PTS', 'MIN', 'PTS_MIN', 'AST', 'FGM', 'FGA', 'FG_PCT'], 
                                       n=10, carry_through_list=['MIN', 'AST'])
    #Filters data to those that have at least 10 previous games
    if len(stats>0):
        stats=stats[stats['n_games']>=10]
        stats = stats[stats['MIN']>0]
        all_stats = all_stats.append(stats)

In [8]:
# Correlation of last 10 points and assists with points
# last 10 points has a decent correlation.  So does PTS/MIN
# last 10 assists has a slight correlation
cor = pd.DataFrame()
feature_list = ['PTS_avg_10', 'AST_avg_10', 'PTS_MIN_avg_10']
for f in feature_list:
    cor.set_value('PTS', f, np.corrcoef(x=stats[f], y=stats['PTS'])[0,1])
cor


Out[8]:
PTS_avg_10 AST_avg_10 PTS_MIN_avg_10
PTS 0.148337 -0.038713 0.156244

In [18]:
# Creates plot of last 10 points and points
# Slight positive pattern 
x=stats['PTS_avg_10']
y=stats['PTS']
plt.subplot(1,2,1)
plt.scatter(x, y, alpha=0.5)
plt.subplot(1,2,2)
plt.scatter(x**2, y, alpha=0.5)


Out[18]:
<matplotlib.collections.PathCollection at 0x11b6302d0>

In [21]:
# Creates plot of last 10 assists and points
# Slight negative pattern, but hard to decern
x=stats['AST_avg_10']
y=stats['PTS']
plt.subplot(1,2,1)
plt.scatter(x, y, alpha=0.5)
plt.subplot(1,2,2)
plt.scatter(x**.5, y, alpha=0.5)


Out[21]:
<matplotlib.collections.PathCollection at 0x11b2c7e50>

In [23]:
# Correlation of transformed last 10 points and assists with points
cor = pd.DataFrame()
cor.set_value('PTS', 'PTS squared', np.corrcoef(x=stats['PTS_avg_10']**2, y=stats['PTS'])[0,1])
cor.set_value('PTS', 'AST sq root', np.corrcoef(x=stats['AST_avg_10']**0.5, y=stats['PTS'])[0,1])
cor


Out[23]:
PTS squared AST sq root
PTS 0.147696 -0.032895

In [11]:
# Creates data to analyze Player's own team recent performance on points
sched_2014 = Schedule(b_dt = '10/1/2014')
sched_2015 = Schedule(b_dt = '10/1/2015')

games = sched_2014.get_games().append(sched_2015.get_games()).reset_index(drop=True)

my_team_stats = Stats(games, 'avg', 'GAME_DATE', 'Home Team', 'Away Team', 'Pts_diff', ['Game_ID', 'GAME_DATE', 'Home Team', 'Away Team'])
#Creates team stats for last 10 games
last_10_team_stats = my_team_stats.get_lastn_stats(10)
#Creates team stats for last 100 games
last_100_team_stats = my_team_stats.get_lastn_stats(100)

#Combines two stats to get trends
team_stats_10 = last_10_team_stats[['Game_ID', 'GAME_DATE', 'Home Team', 'Away Team', 'H_PTS_10', 'A_PTS_10', 'H_BTB', 'A_BTB']]
team_stats_100 = last_100_team_stats[['Game_ID', 'H_PTS_100', 'A_PTS_100']]
team_stats = pd.merge(team_stats_10, team_stats_100, on='Game_ID')
team_stats['H_PTS_Trend'] = [(a-b)*1.0/b for a,b in zip(team_stats['H_PTS_10'], team_stats['H_PTS_100'])]
team_stats['A_PTS_Trend'] = [(a-b)*1.0/b for a,b in zip(team_stats['A_PTS_10'], team_stats['A_PTS_100'])]

#Splits it out by team instead of by game
print len(team_stats)
h_stats = team_stats[['Game_ID', 'GAME_DATE', 'Home Team', 'H_PTS_Trend', 'H_BTB']].rename(index = str, columns={'Home Team':'Team', 'H_PTS_Trend':'PTS_Trend', 'H_BTB':'BTB'})
print len(h_stats)
a_stats = team_stats[['Game_ID', 'GAME_DATE', 'Away Team', 'A_PTS_Trend', 'A_BTB']].rename(index = str, columns={'Away Team':'Team', 'A_PTS_Trend':'PTS_Trend', 'A_BTB':'BTB'})
print len(a_stats)
team_stats_sep = h_stats.append(a_stats).reset_index(drop=True)
print len(team_stats_sep)


2460
2460
2460
4920

In [12]:
# Joins team data and player game logs to get dataset

all_game_logs['Team'] = [a[0:3] for a in all_game_logs['MATCHUP']]
all_game_logs['GAME_DATE'] = pd.to_datetime(all_game_logs['GAME_DATE'])
team_stats_sep['GAME_DATE'] = pd.to_datetime(team_stats_sep['GAME_DATE'])
team_data = pd.merge(all_game_logs, team_stats_sep, on=['Team', 'GAME_DATE'])
print len(all_game_logs), len(team_stats_sep), len(team_data)


25430 4920 25430

In [13]:
# Correlation of Team's Points Trend and BTB games with Points
# almost no correlation with PTS Trend. BTB is very slight, but in right direction
cor = pd.DataFrame()
feature_list = ['PTS_Trend', 'BTB']
for f in feature_list:
    cor.set_value('PTS', f, np.corrcoef(x=team_data[f], y=team_data['PTS'])[0,1])
cor


Out[13]:
PTS_Trend BTB
PTS -0.000628 -0.001655

In [24]:
# Looks pretty symmetrical
x=team_data['PTS_Trend']
y=team_data['PTS']
plt.subplot(1,2,1)
plt.scatter(x, y, alpha=0.5)
plt.subplot(1,2,2)
plt.scatter(x**2, y, alpha=0.5)


Out[24]:
<matplotlib.collections.PathCollection at 0x11de053d0>

In [15]:
# Looks like players with BTB games have lower points
x=team_data['BTB']
y=team_data['PTS']
plt.scatter(x, y, alpha=0.5)


Out[15]:
<function matplotlib.pyplot.show>

In [25]:
# Correlation of Transformed Team's Points Trend
cor = pd.DataFrame()
cor.set_value('PTS', 'PTS_Trend', np.corrcoef(x=team_data['PTS_Trend']**2, y=team_data['PTS'])[0,1])
cor


Out[25]:
PTS_Trend
PTS 0.011913

In [27]:
# Compares differences in means between back to back games
print 'No BTB', np.mean(team_data[team_data['BTB']==0]['PTS'])
print 'BTB', np.mean(team_data[team_data['BTB']==1]['PTS'])


No BTB 9.83247280565
BTB 9.80094873198

In [45]:
# Creates data to analyze Player's opposing team recent performance on points

# Joins team data and player game logs to get dataset
team_stats_sep_opp = team_stats_sep.copy().rename(index=str, columns={'PTS_Trend': 'O_PTS_Trend', 'BTB':'O_BTB'})[['Team', 'GAME_DATE', 'O_PTS_Trend', 'O_BTB']]
all_game_logs['Opp Team'] = [a.split(' ')[-1][0:3] for a in all_game_logs['MATCHUP']]
team_data = pd.merge(all_game_logs, team_stats_sep, left_on=['Opp Team', 'GAME_DATE'], right_on=['Team', 'GAME_DATE'], suffixes=['','_O'])
print len(all_game_logs), len(team_stats_sep), len(team_data)


25430 4920 25430

In [46]:
team_data.columns


Out[46]:
Index([u'SEASON_ID', u'Player_ID', u'Game_ID', u'GAME_DATE', u'MATCHUP', u'WL',
       u'MIN', u'FGM', u'FGA', u'FG_PCT', u'FG3M', u'FG3A', u'FG3_PCT', u'FTM',
       u'FTA', u'FT_PCT', u'OREB', u'DREB', u'REB', u'AST', u'STL', u'BLK',
       u'TOV', u'PF', u'PTS', u'PLUS_MINUS', u'VIDEO_AVAILABLE', u'PTS_MIN',
       u'Team', u'Opp Team', u'Game_ID_O', u'Team_O', u'PTS_Trend', u'BTB'],
      dtype='object')

In [47]:
# Correlation of other Team's Points Trend and BTB games with Points
# almost no correlation with PTS Trend. BTB is very slight, but in right direction
cor = pd.DataFrame()
feature_list = ['PTS_Trend', 'BTB']
for f in feature_list:
    cor.set_value('PTS', f, np.corrcoef(x=team_data[f], y=team_data['PTS'])[0,1])
cor


Out[47]:
PTS_Trend BTB
PTS -0.006996 0.002678

In [41]:
# Looks pretty symmetrical
x=team_data['PTS_Trend']
y=team_data['PTS']
plt.subplot(1,2,1)
plt.scatter(x, y, alpha=0.5)
plt.subplot(1,2,2)
plt.scatter(x**2, y, alpha=0.5)


Out[41]:
<matplotlib.collections.PathCollection at 0x11efb5f10>

In [42]:
# Looks like players with BTB games have lower points
x=team_data['BTB']
y=team_data['PTS']
plt.scatter(x, y, alpha=0.5)


Out[42]:
<matplotlib.collections.PathCollection at 0x122d42b10>

In [43]:
# Correlation of Transformed Team's Points Trend
cor = pd.DataFrame()
cor.set_value('PTS', 'PTS_Trend', np.corrcoef(x=team_data['PTS_Trend']**2, y=team_data['PTS'])[0,1])
cor


Out[43]:
PTS_Trend
PTS 0.002721

In [44]:
# Compares differences in means between back to back games
print 'No BTB', np.mean(team_data[team_data['BTB']==0]['PTS'])
print 'BTB', np.mean(team_data[team_data['BTB']==1]['PTS'])


No BTB 9.81460815362
BTB 9.86545060502

In [ ]:
# Need to fix adding opponent team data